# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence library warnings and apply the ggplot look to every figure.
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

# Read the FIFA 19 CSV export from its local path.
csv_path = 'C:\\Users\\smile\\Desktop\\Prasfur\\Project 1 - FIFA\\Data\\FIFA 19.csv'
df = pd.read_csv(csv_path)

# First look at the data: leading rows, (rows, columns), column labels.
df.head()
df.shape
df.columns
# Remove the unwanted 'Unnamed: 0' column.
df.drop(columns=['Unnamed: 0'], inplace=True)

# Column dtypes and non-null counts
df.info()

# Number of missing values per column
df.isnull().sum()

# Extent of null values; each heatmap gets its own figure so they do not
# pile up on the same axes when the file is run as a script.
plt.figure()
sns.heatmap(df.isnull(), yticklabels=False)
plt.show()

# 'Loaned From' has almost no values, so drop it.
df.drop(columns=['Loaned From'], inplace=True)

# Fill missing numeric cells with the mean of their column.
# numeric_only=True is required: recent pandas raises a TypeError when
# mean() is asked to average object (string) columns.
df.fillna(df.mean(numeric_only=True), inplace=True)
df.isnull().sum()

# Extent of null values after the numeric fill
plt.figure()
sns.heatmap(df.isnull(), yticklabels=False)
plt.show()

# The cells that are still empty belong to non-numeric columns; mark them
# with the placeholder "Unassigned".
df.fillna("Unassigned", inplace=True)
df.isnull().sum()
df.dtypes

# Displaying all columns
df.keys()

# Final data clean-up procedure: drop columns not used in the analysis.
df.drop(columns=['Photo', 'Flag', 'Club Logo', 'Real Face', 'Special'], inplace=True)
df.head()
df.isnull()
df.isnull().sum()

# Extent of null values after the clean-up
plt.figure()
sns.heatmap(df.isnull(), yticklabels=False)
plt.show()

# Getting insights from dataset
df.describe()
# Age distribution of players
# Histogram: number of players per age
sns.set(style="dark", palette="colorblind", color_codes=True)
x = df.Age
plt.figure(figsize=(12, 8))
ax = sns.distplot(x, bins=58, kde=False, color='g')
ax.set_xlabel(xlabel="Player's age", fontsize=16)
ax.set_ylabel(ylabel='Number of players', fontsize=16)
ax.set_title(label='Histogram of players age', fontsize=20)
plt.show()

# Oldest player: first row holding the maximum age.
oldest = df.loc[df['Age'].idxmax()]
print("The oldest player in FIFA 19 is", df['Age'].max(), "years old. His name is", oldest['Name'],
      'he is from', oldest['Nationality'], 'and plays for', oldest['Club'], '.')

# The message reports the median, so compute the median (the mean was
# printed under this label before).
print('The median age of a player on FIFA 19 is', df['Age'].median())

# Youngest player: first row holding the minimum age.
youngest = df.loc[df['Age'].idxmin()]
print('The youngest players is', df['Age'].min(), "years old. His name is", youngest['Name'],
      'he is from', youngest['Nationality'], 'and plays for', youngest['Club'], '.')
# Overall rating distribution of players
plt.hist(df['Overall'])
plt.xlabel('Players Rating ')
plt.ylabel('Number of players')
plt.show()

# Highest-rated player: first row holding the maximum 'Overall'.
best = df.loc[df['Overall'].idxmax()]
print("The best player in FIFA 19 is", df['Overall'].max(), "overall. His name is", best['Name'],
      'he is from', best['Nationality'], 'and plays for', best['Club'], '.')

# The message reports the median, so compute the median (the mean was
# printed under this label before).
print('The median rating of a player on FIFA 19 is', df['Overall'].median())

# Lowest-rated player: first row holding the minimum 'Overall'.
worst = df.loc[df['Overall'].idxmin()]
print('The worst players is', df['Overall'].min(), "overall. His name is", worst['Name'],
      'he is from', worst['Nationality'], 'and plays for', worst['Club'], '.')
# Distribution of players' potential rating
plt.hist(df['Potential'], color='blue')
plt.xlabel('Players Potential')
plt.ylabel('Number of players')
plt.show()

# Player with the highest potential: first row holding the maximum.
bestp = df.loc[df['Potential'].idxmax()]
print("The best potential player in FIFA 19 is", df['Potential'].max(), "overall. His name is", bestp['Name'],
      'he is from', bestp['Nationality'], 'and plays for', bestp['Club'], '.')

# The message reports the median, so compute the median (the mean was
# printed under this label before).
print('The median potential rating of a player on FIFA 19 is', df['Potential'].median())

# Player with the lowest potential: first row holding the minimum.
worstp = df.loc[df['Potential'].idxmin()]
print('The worst potential player is', df['Potential'].min(), "overall. His name is", worstp['Name'],
      'he is from', worstp['Nationality'], 'and plays for', worstp['Club'], '.')
# Age vs Overall rating.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is `data`).
plt.figure(figsize=(10, 5))
sns.regplot(x=df['Age'], y=df['Overall'])
plt.title('Age vs Overall rating')
plt.show()

# Age vs Potential rating
plt.figure(figsize=(10, 5))
sns.regplot(x=df['Age'], y=df['Potential'], color='green')
plt.title('Age vs Potential rating')
plt.show()
# Correlation heatmap of the data set.
# Only numeric columns are correlated: recent pandas raises on corr() when
# the frame still contains string columns instead of skipping them.
plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), linewidths=3)
plt.title('Dataset Heatmap')
plt.show()
# Preferred foot analysis.
# The column is passed as x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the variable to count.
plt.figure(figsize=(5, 5))
sns.countplot(x=df['Preferred Foot'])
plt.show()
# Bar chart of the 25 rows with the highest 'Overall' rating.
ranked_by_overall = df.sort_values(by='Overall', ascending=False)
df_best_players = ranked_by_overall.head(25).copy()

plt.figure(figsize=(20, 10))
plt.bar(df_best_players['Name'], df_best_players['Overall'], width=0.5, color='Purple')
plt.title('Top 25 players Overall Rating', fontsize=40)
plt.xlabel('Players names', fontsize=30)
plt.ylabel('Overall Rating', fontsize=30)
plt.xticks(rotation=90, fontsize=20, fontname='monospace')
# Zoom the y axis into the 87-95 band.
plt.ylim(87, 95)
plt.show()
# Scatter plot of Stamina (x) against SprintSpeed (y), half-transparent dots.
stamina_axes = df.plot.scatter(x='Stamina', y='SprintSpeed', alpha=0.5)
stamina_axes.set_xlabel('Stamina')
stamina_axes.set_ylabel('Sprint Speed')
stamina_axes.set_title('Stamina-Sprint Speed Scatter Plot')
plt.show()
# Relation between Age & Sprint Speed.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional x, y.
plt.figure(1, figsize=(10, 5))
sns.regplot(x=df['Age'], y=df['SprintSpeed'], color='Green')
plt.title('Age vs Sprint Speed')
plt.show()
# Line plot: mean 'Potential' for every distinct 'Overall' value.
mean_potential_by_overall = df.groupby('Overall')['Potential'].mean()
mean_potential_by_overall.plot()
plt.ylabel("Potential", rotation=90)
plt.title('Overall vs Potential')
plt.show()

# Number of players in each position.
plt.figure(1, figsize=(20, 10))
p = sns.countplot(data=df, x='Position', palette='inferno_r')
p.set_title('Count of Players', fontsize=25)
#Cleaning some of values so that we can interpret them
def value_to_int(df_value):
    """Convert a currency string such as '€110.5M' or '€565K' to a number.

    A leading currency symbol is ignored; a trailing 'M' multiplies by
    1,000,000 and a trailing 'K' by 1,000. A string without a suffix
    (e.g. '€500') is taken at face value.

    Parameters:
        df_value: the cell to convert. Numeric cells are returned as floats
            so the conversion can safely be applied more than once.

    Returns:
        The amount as a float, or 0 when the cell cannot be parsed
        (text such as 'Unassigned', None, NaN, an empty string).
    """
    if not isinstance(df_value, str):
        try:
            number = float(df_value)
        except (TypeError, ValueError):
            return 0
        # NaN is the only float that differs from itself.
        return 0 if number != number else number

    text = df_value.strip().lstrip('€$£')
    multiplier = 1
    suffix = text[-1:]
    if suffix == 'M':
        multiplier = 1000000
        text = text[:-1]
    elif suffix == 'K':
        multiplier = 1000
        text = text[:-1]
    # Only strip the last character when it really is a unit suffix;
    # otherwise '€500' would lose its final digit.
    try:
        return float(text) * multiplier
    except ValueError:
        return 0
# Turn the 'Value' and 'Wage' columns into plain numbers via value_to_int.
df['Value'] = df['Value'].apply(value_to_int)
df['Wage'] = df['Wage'].apply(value_to_int)

# 25 rows with the highest 'Value'. The bars are scaled by 1e6 so the axis
# really is in millions, as the title states.
df1 = df.sort_values(by='Value', ascending=False).head(25).copy()
plt.figure(figsize=(20, 5))
plt.bar(df1['Name'], df1['Value'] / 1e6, width=0.5, color='Orange')
plt.xlabel('Players names', fontsize=30)
plt.xticks(rotation=90, fontsize=20, fontname='monospace')
plt.ylabel('Value', fontsize=30)
plt.title('Value of Players(in Millions)', fontsize=30)
plt.show()

# 25 rows with the highest 'Wage', scaled to millions for the same reason.
df2 = df.sort_values(by='Wage', ascending=False).head(25).copy()
plt.figure(figsize=(20, 5))
plt.bar(df2['Name'], df2['Wage'] / 1e6, width=0.5, color='Red')
plt.xlabel('Players names', fontsize=30)
plt.xticks(rotation=90, fontsize=20, fontname='monospace')
plt.ylabel('Wage', fontsize=30)
plt.title('Wage of Players(in Millions)', fontsize=30)
plt.show()
# Distribution of Jersey Number
df['Jersey Number'].plot(kind='hist', bins=320, color='blue', label='Jersey Number',
                         alpha=1.0, grid=True, figsize=(10, 5))
# plt.legend must be called; the bare attribute reference did nothing.
plt.legend()
plt.xlabel('Number')
plt.ylabel('Players')
plt.title('Average Jersey Number')
plt.show()

# Distribution of Overall of players
df.Overall.plot(kind='hist', bins=400, color='green',
                label='Overall', alpha=1.0, grid=False, figsize=(10, 5))
plt.legend()
plt.xlabel('Overall')
plt.ylabel('Players')
plt.title('Average Overall')
plt.show()
# Three attribute-vs-attribute scatter plots: Shotpower & Finishing,
# Composure & Penalties, Ball Control & Short Passing.
# Each entry is (x column, y column, dot colour, plot title); the axis
# labels are the column names themselves.
scatter_specs = (
    ('ShotPower', 'Finishing', 'yellow', 'ShotPower vs Finishing Scatter Plot'),
    ('Composure', 'Penalties', 'Orange', 'Composure vs Penalties Scatter plot'),
    ('BallControl', 'ShortPassing', 'Brown', 'BallControl vs ShortPassing Scatter Plot'),
)
for x_col, y_col, dot_color, heading in scatter_specs:
    df.plot(kind='scatter', x=x_col, y=y_col, alpha=1.0, color=dot_color)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(heading)
# Relation between Sliding tackle & Interceptions.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional x, y.
plt.figure(1, figsize=(10, 5))
sns.regplot(x=df['SlidingTackle'], y=df['Interceptions'], color='Green')
plt.title('Sliding Tackle vs Interceptions Speed')
plt.show()

# Relation between GK Diving & GK Positioning
plt.figure(figsize=(10, 5))
sns.regplot(x=df['GKDiving'], y=df['GKPositioning'], color='Purple')
plt.title('GK Diving vs GK Positioning')
plt.show()
# The clubs and their players' overall ratings
clubs = ('Juventus', 'Real Madrid', 'Paris Saint-Germain', 'FC Barcelona', 'Liverpool',
         'Manchester United', 'FC Bayern München', 'Manchester City', 'Napoli')
plt.figure(figsize=(15, 5))
# Filter on club membership only. And-ing the mask with the integer columns
# 'Age' and 'Overall' is a bitwise AND, which silently discarded every
# player whose age or rating is an even number.
df_club = df.loc[df['Club'].isin(clubs)]
ax = sns.barplot(x=df_club['Club'], y=df_club['Overall'], palette="rocket")
ax.set_title(label='Distribution overall in several clubs', fontsize=20)
# Best player of each position (by 'Overall') with age, club and nationality.
# idxmax() returns index labels, so the rows are selected with .loc.
df.loc[df.groupby('Position')['Overall'].idxmax(), ['Position', 'Name', 'Age', 'Club', 'Nationality']]
# Best player for every skill attribute
pr_cols = ['Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
           'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
           'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
           'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
           'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
           'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
           'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']
for feature in pr_cols:
    # Look the player up by column name rather than by integer position 1.
    # NOTE(review): position 1 is presumed to be 'Name' - confirm against
    # the column order of the cleaned frame.
    top_player = df.loc[df[feature].idxmax(), 'Name']
    print('Best {0} : {1}'.format(feature, top_player))
# Count plot of the 'Height' column.
plt.figure(figsize=(13, 8))
ax = sns.countplot(data=df, x='Height', palette='dark')
ax.set_xlabel('Height in Foot per inch', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
ax.set_title('Count of players on Basis of Height', fontsize=20)
plt.show()
# Best player of each position (by 'Potential') with age, club and nationality.
# idxmax() returns index labels, so the rows are selected with .loc.
df.loc[df.groupby('Position')['Potential'].idxmax(), ['Position', 'Name', 'Age', 'Club', 'Nationality']]
# Overall scores of players from a selection of nations
some_countries = ('England', 'Germany', 'Spain', 'Argentina', 'France', 'Brazil', 'Italy', 'Colombia')
# Filter on nationality only. And-ing the mask with the integer 'Overall'
# column is a bitwise AND that silently dropped players with an even rating.
df_countries = df.loc[df['Nationality'].isin(some_countries)]
plt.figure(figsize=(10, 5))
ax = sns.barplot(x=df_countries['Nationality'], y=df_countries['Overall'], palette='colorblind')
ax.set_xlabel(xlabel='Countries', fontsize=15)
ax.set_ylabel(ylabel='Overall Scores', fontsize=15)
ax.set_title(label='Distribution of overall scores of players from different countries', fontsize=20)
plt.show()
# Overall scores of players in a selection of popular clubs
some_clubs = ('Manchester United', 'Liverpool', 'Juventus', 'Napoli', 'Arsenal', 'Manchester City',
              'Tottenham Hotspur', 'FC Barcelona', 'Valencia CF', 'Chelsea', 'Real Madrid')
# Filter on club membership only. And-ing the mask with the integer 'Overall'
# column is a bitwise AND that silently dropped players with an even rating.
data_clubs = df.loc[df['Club'].isin(some_clubs)]
plt.figure(figsize=(20, 5))
ax = sns.barplot(x=data_clubs['Club'], y=data_clubs['Overall'], palette='deep')
ax.set_xlabel(xlabel='Some Popular Clubs', fontsize=15)
ax.set_ylabel(ylabel='Overall Score', fontsize=15)
ax.set_title(label='Distribution of Overall Score in Different popular Clubs', fontsize=20)
plt.show()
# Player attributes considered when ranking features per position
player_features = ('Acceleration', 'Aggression', 'Agility',
                   'Balance', 'BallControl', 'Composure',
                   'Crossing', 'Dribbling', 'FKAccuracy',
                   'Finishing', 'GKDiving', 'GKHandling',
                   'GKKicking', 'GKPositioning', 'GKReflexes',
                   'HeadingAccuracy', 'Interceptions', 'Jumping',
                   'LongPassing', 'LongShots', 'Marking', 'Penalties')

# Top four features for every position in football.
# The column selection must be a list: recent pandas rejects a tuple of
# keys on a groupby. All four names are joined into the message (the
# previous format string had room for only three of them).
position_means = df.groupby('Position')[list(player_features)].mean()
for position, means in position_means.iterrows():
    print('Position {}: {}'.format(position, ', '.join(means.nlargest(4).index)))
# Sum of 'Value' per club, reused for both rankings below.
value_per_club = df.groupby('Club')['Value'].sum()

# Five clubs with the largest summed value
value_per_club.sort_values(ascending=False).head()

# Five clubs with the smallest summed value
value_per_club.sort_values().head()

# Five clubs whose best player has the highest 'Overall'
best_rating_per_club = df.groupby('Club')['Overall'].max()
best_rating_per_club.sort_values(ascending=False).head()
# Relation between Age and a series of attributes, one regression plot each.
# Each entry is (attribute column, line colour, plot title).
age_relations = (
    ('Reactions', 'blue', 'Age vs Reactions'),
    ('ShotPower', 'green', 'Age vs Shotpower'),
    ('Jumping', 'Orange', 'Age vs Jumping'),
    ('SprintSpeed', 'Purple', 'Age vs SprintSpeed'),
    ('Stamina', 'Brown', 'Age vs Stamina'),
    ('Agility', 'Yellow', 'Age vs Agility'),
    ('Strength', 'aqua', 'Age vs Strength'),
    ('Vision', 'navy', 'Age vs Vision'),
)
for attribute, line_color, heading in age_relations:
    plt.figure(figsize=(10, 5))
    # x/y are passed by keyword: seaborn >= 0.12 rejects positional x, y.
    sns.regplot(x=df['Age'], y=df[attribute], color=line_color)
    plt.title(heading)
    plt.show()
# Distribution of players according to their Overall.
# The column is passed as x= because seaborn >= 0.12 treats the first
# positional argument as `data`.
plt.figure(figsize=(20, 10))
sns.countplot(x=df['Overall'], palette='rocket')
plt.show()
# Columns shown in every ranking table below; the attribute-based tables
# append the attribute they are sorted by.
profile_cols = ['Name', 'Club', 'Nationality', 'Overall', 'Age']

# Eldest players
df.sort_values(by='Age', ascending=False)[profile_cols].head()

# Youngest players
df.sort_values(by='Age', ascending=True)[profile_cols].head()

# Best Freekick takers
df.sort_values(by='FKAccuracy', ascending=False)[profile_cols + ['FKAccuracy']].head()

# Best Penalty takers
df.sort_values(by='Penalties', ascending=False)[profile_cols + ['Penalties']].head()

# Players with best ball control
df.sort_values(by='BallControl', ascending=False)[profile_cols + ['BallControl']].head()

# Quick players
df.sort_values(by='SprintSpeed', ascending=False)[profile_cols + ['SprintSpeed']].head()
# Age distribution among famous clubs
clubs = ['Chelsea', 'Arsenal', 'Juventus', 'Paris Saint-Germain', 'FC Bayern München',
         'Real Madrid', 'FC Barcelona', 'Borussia Dortmund', 'Manchester United',
         'FC Porto', 'Liverpool', 'Manchester City']
# Filter on club membership only. And-ing the mask with the integer 'Age'
# column is a bitwise AND that silently dropped players with an even age.
club_age = df.loc[df['Club'].isin(clubs)]
plt.figure(1, figsize=(15, 7))
sns.boxplot(x='Club', y='Age', data=club_age, palette='rocket')
plt.title('Age Distribution in famous clubs')
plt.xticks(rotation=50)
plt.show()

# Overall rating of the clubs (same fix: no bitwise AND with 'Overall')
club_rating = df.loc[df['Club'].isin(clubs)]
plt.figure(1, figsize=(15, 7))
sns.boxplot(x='Club', y='Overall', data=club_rating, palette='rocket')
plt.title('Overall Rating Distribution in famous clubs')
plt.xticks(rotation=50)
plt.show()
# Best club: sum of the 'Overall' ratings of each club's players.
# One groupby replaces the per-club scan of the whole frame.
club_totals = df.groupby('Club', sort=False)['Overall'].sum()
# "Unassigned" is the placeholder written into empty cells during cleaning,
# not a club, so it must not compete in the ranking.
club_totals = club_totals.drop('Unassigned', errors='ignore')

best_club = club_totals.to_frame('overall')
best_club['club'] = best_club.index
best_club = best_club.sort_values(by='overall', ascending=False)

plt.figure(1, figsize=(15, 6))
sns.barplot(x='club', y='overall', data=best_club.head(5), palette='rocket')
plt.xlabel("Club", size=15)
plt.ylabel('Sum of Overall Rating of players in club', size=15)
plt.title('Clubs with best Players (sum of overall ratings of players per club)', size=25)
# Zoom the y axis into the 2450-2600 band.
plt.ylim(2450, 2600)
plt.show()
# Wage vs Overall for the first 30 rows of the frame
plt.figure(figsize=(25, 10))
sns.barplot(data=df.head(30), y='Wage', x='Overall', palette='rocket')
plt.title('Wage vs Overall', size=35)
plt.xlabel("Overall", size=25)
plt.ylabel('Wage', size=25)
plt.show()

# Wage vs Potential for the first 30 rows of the frame
plt.figure(figsize=(25, 10))
sns.barplot(data=df.head(30), y='Wage', x='Potential', palette='rocket')
plt.title('Wage vs Potential', size=35)
# Potential is on the x axis and Wage on the y axis (the labels were swapped).
plt.xlabel("Potential", size=25)
plt.ylabel('Wage', size=25)
plt.show()
# Count of players for each position, most frequent first.
# rename_axis/reset_index(name=...) yields the columns 'Position' and
# 'Count' regardless of how the installed pandas names value_counts output.
df3 = df['Position'].value_counts().rename_axis('Position').reset_index(name='Count')
df3.head(10)

plt.figure(figsize=(20, 10))
sns.countplot(x='Position', data=df)
plt.title("Player count per position", size=25)
plt.xlabel("Position", size=20)
plt.ylabel("Count", size=20)
plt.show()
print("Maximum players were found with position {} with a count of {}".format(
    df3['Position'].iloc[0], df3['Count'].iloc[0]))

import plotly.graph_objects as go

# Radar chart of the five most common positions; the axis labels are taken
# from the data so they always match the plotted counts.
top_positions = df3.head()
fig = go.Figure(data=go.Scatterpolar(r=top_positions['Count'],
                                     theta=top_positions['Position'],
                                     fill='toself'))
fig.update_layout(polar=dict(radialaxis=dict(visible=True),), showlegend=False)
fig.show()
# Players whose contract has ended (till 2019).
# The four-digit year is extracted from 'Contract Valid Until' and compared
# numerically. A plain string comparison with '2019' only works for cells
# that are a bare year and skips any value written as a full date.
contract_year = pd.to_numeric(
    df['Contract Valid Until'].astype(str).str.extract(r'(\d{4})', expand=False),
    errors='coerce')
ended = contract_year <= 2019  # cells without a year are NaN and compare False
df2 = df.loc[ended, ['ID', 'Name', 'Age', 'Nationality', 'Contract Valid Until', 'Position', 'Club']].copy()
# Keep only the year so the per-year counts below group correctly.
df2['Contract Valid Until'] = contract_year[ended].astype(int).astype(str)
print("Total Contracts ended:", df2.shape[0])
df2[['ID', 'Name', 'Age', 'Nationality', 'Position', 'Club']].head(15)

# Players' age when their contract ended (first 15 rows)
plt.figure(figsize=(20, 10))
sns.barplot(data=df2.head(15), x='Name', y='Age')
plt.title('Age of players when their contract ended (top 15)', size=20)
plt.xlabel("Names", size=25)
plt.ylabel('Age', size=25)
plt.show()
print("Average age of players whose contract ended:", int(df2['Age'].mean()))

# Contracts ended per year
plt.figure(figsize=(5, 5))
sns.countplot(x=df2['Contract Valid Until'])
plt.title('No. of contracts ended in previous years', size=15)
plt.xlabel("Years", size=15)
plt.ylabel('Count', size=15)
plt.show()

# Look the counts up by year label instead of by position in the sorted
# value_counts result, which depends on which year happens to be larger.
ended_per_year = df2['Contract Valid Until'].value_counts()
print("Contracts ended in 2019:", ended_per_year.get('2019', 0))
print("Contracts ended in 2018:", ended_per_year.get('2018', 0))
# Contracts ended for various positions, most frequent first.
# rename_axis/reset_index(name=...) yields the columns 'Position' and
# 'Count' regardless of how the installed pandas names value_counts output.
d = df2['Position'].value_counts().rename_axis('Position').reset_index(name='Count')
d.head()

plt.figure(figsize=(20, 10))
sns.countplot(x=df2['Position'])
plt.title('No. of contracts ended per position', size=20)
# The x axis shows positions, not years.
plt.xlabel("Position", size=15)
plt.ylabel('Count', size=15)
plt.show()
print("Most contracts ended for position {} and their count is: {}".format(
    d['Position'].iloc[0], d['Count'].iloc[0]))
# Contracts ended per club, most frequent first.
# rename_axis/reset_index(name=...) yields the columns 'Club' and 'Count'
# regardless of how the installed pandas names value_counts output.
d1 = df2['Club'].value_counts().rename_axis('Club').reset_index(name='Count')
d1.head(10)

plt.figure(figsize=(15, 10))
top_clubs = d1.head(10)
plt.bar(top_clubs['Club'], top_clubs['Count'], width=0.7, color='red')
plt.title('No. of contracts ended for clubs (top 10)', size=20)
plt.xlabel("Clubs", size=25)
plt.xticks(rotation=90, fontsize=15, fontname='sans-serif')
plt.ylabel('Count', size=25)
plt.show()
print("\nMost contracts ended for club {} and its count is: {}".format(
    d1['Club'].iloc[0], d1['Count'].iloc[0]))
# Number of players per nationality, most frequent first.
# rename_axis/reset_index(name=...) yields the columns 'Nation' and 'Count'
# regardless of how the installed pandas names value_counts output.
df1 = df['Nationality'].value_counts().rename_axis('Nation').reset_index(name='Count')
df1.head()

# Most players from a country
plt.figure(figsize=(25, 10))
sns.barplot(data=df1.head(20), x='Nation', y='Count', palette='rocket')
plt.title('Most players from a country (top 20)', size=35)
plt.xlabel("Nations", size=25)
plt.ylabel('Count', size=25)
plt.show()

# Pie chart depiction of countries with most players.
# plt.subplots creates the figure itself; a preceding plt.figure() only
# produced an extra empty figure.
top_nations = df1.head(7)
fig1, ax1 = plt.subplots(figsize=(8, 8))
plt.subplots_adjust(left=0.5, wspace=0.2)
# Pull the largest slice out; sized from the data so it also works when
# fewer than seven nations are present.
explode = [0.1] + [0] * (len(top_nations) - 1)
ax1.pie(top_nations['Count'], explode=explode, labels=top_nations['Nation'],
        autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal')
plt.legend(top_nations['Nation'], loc="best")
plt.tight_layout()
plt.show()
import folium
from geopy.geocoders import Nominatim
import requests

# Geocode the 20 nations with the most players and mark them on a world map.
geolocator = Nominatim(user_agent='foursquare_api')
# Work on a copy so the new columns are not written into a slice of df1.
for_map = df1.head(20).copy()
lat = []
lng = []
for nm in for_map['Nation']:
    location = geolocator.geocode(str(nm), timeout=10)
    # geocode() returns None when nothing is found; record a gap instead
    # of failing on the attribute access.
    lat.append(location.latitude if location is not None else np.nan)
    lng.append(location.longitude if location is not None else np.nan)
for_map['Latitude'] = lat
for_map['Longitude'] = lng
for_map.head()

wm = folium.Map(zoom_start=2, location=[0, 0])
mp = folium.map.FeatureGroup()
# Only nations that were actually located can be placed on the map.
located = for_map.dropna(subset=['Latitude', 'Longitude'])
for i, j, k in zip(located['Latitude'], located['Longitude'], located['Count']):
    mp.add_child(folium.CircleMarker(location=[i, j], radius=5, color='red', fill_color='Yellow'))
    folium.Marker([i, j], popup='Player Count ' + str(k)).add_to(mp)
wm.add_child(mp)
wm
# Here, we select some attributes of the players to predict 'Overall', which serves as the total performance measure of a player.
# Columns kept for the model: the target 'Overall' first, then the inputs.
predictor_cols = [
    'Overall', 'Potential', 'Value', 'Wage', 'Skill Moves',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
predictors = df[predictor_cols]
predictors.head()
# As seen from the scatter and regression plots above, the predictor columns vary approximately linearly with 'Overall'. Hence we select the 'Multiple Linear Regression' algorithm.
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import r2_score

# Every column of `predictors` except the target is a model input. Deriving
# the list once keeps the training and test matrices in the same column order.
feature_cols = [col for col in predictors.columns if col != 'Overall']

# Random ~75% / 25% train-test split; the fixed seed makes the split, and
# therefore every metric below, reproducible between runs.
rng = np.random.RandomState(42)
ms = rng.rand(len(df)) < 0.75
train = predictors[ms]
test = predictors[~ms]

x = np.asanyarray(train[feature_cols])
y = np.asanyarray(train['Overall'])
x
y

regr = linear_model.LinearRegression()
regr.fit(x, y)
print("Coefficients:", regr.coef_)
print("\nIntercept:", regr.intercept_)

# Predict from a plain array, the same kind of input the model was fitted on.
x1 = np.asanyarray(test[feature_cols])
y1 = np.asanyarray(test['Overall'])
Y = regr.predict(x1)

# Side-by-side comparison; predictions are rounded (not truncated) to the
# nearest whole rating.
comp = pd.DataFrame({"Predicted Values": Y, "Actual Values": y1})
comp["Predicted Values"] = comp["Predicted Values"].round().astype('int64')
comp.head()

# Mean of the squared residuals (this is the MSE, not a residual *sum*).
f1 = np.mean((Y - y1) ** 2)
print("Mean squared error: {}".format(f1))
f2 = regr.score(x1, y1)
print("Variance Score:", f2)
f3 = np.mean(np.absolute(Y - y1))
print("Mean absolute error:", f3)
# r2_score expects (y_true, y_pred); the metric is not symmetric.
f4 = r2_score(y1, Y)
print("R2-Score:", f4)
print("R2-Score Percentage", round(f4 * 100, 3), "%")
# Square root of the MSE, i.e. the RMSE.
f5 = np.sqrt(metrics.mean_squared_error(y1, Y))
print("RMSE:", f5)

# Summary table of the metrics, labelled by what each value really is.
a = {"Feature": ["Mean Squared Error", "Variance Score", "Mean Absolute Error",
                 "Root Mean Squared Error", "R2-Score Percentage"],
     "Value": [round(f1, 3), round(f2, 3), round(f3, 3), round(f5, 3), round(f4 * 100, 3)]}
summary = pd.DataFrame(a)
summary.set_index('Feature', inplace=True)
summary
# Here, we cluster the players by their potential and overall performance, so that each group can be assigned a label.
# We will use the k-means clustering algorithm.
from sklearn.cluster import KMeans
# We cluster the players into 4 major categories:
# Copy the two clustering columns so the label column added below is not
# written into a slice of df.
predictors1 = df[['Overall', 'Potential']].copy()
from sklearn.preprocessing import StandardScaler

# Use BOTH columns. The previous slice [:, 1:] dropped 'Overall', so the
# clustering was driven by 'Potential' alone.
X = predictors1.values
clus_dataset = StandardScaler().fit_transform(X)
# Original Values
X
# Normalized Values
clus_dataset

# Fixed seed and explicit n_init so the cluster labels are the same on
# every run.
kMeans = KMeans(init='k-means++', n_clusters=4, n_init=10, random_state=0)
kMeans.fit(clus_dataset)

# Generating labels
labels = kMeans.labels_
predictors1['Labels'] = labels
df['Labels'] = labels
# Columns shown for every cluster, selected by name instead of by the
# fragile integer positions 1, 4, 5 and 82.
# NOTE(review): 'Overall', 'Potential' and 'Labels' are the columns the code
# below reads; 'Name' is presumed to be position 1 - confirm against the
# column order of the cleaned frame.
cluster_cols = ['Name', 'Overall', 'Potential', 'Labels']


def _describe_cluster(label, number):
    """Print and return the statistics of the players carrying one label.

    Parameters:
        label: value of df['Labels'] that selects the cluster.
        number: 1-based cluster number used in the printed message.

    Returns:
        (rows of the cluster, mean 'Potential', mean 'Overall', row count)
    """
    rows = df.loc[df['Labels'] == label, cluster_cols]
    mean_potential = rows['Potential'].mean()
    mean_overall = rows['Overall'].mean()
    members = rows.shape[0]
    print("Mean Potential of cluster:", mean_potential)
    print("Mean Overall of cluster:", mean_overall)
    print("Members in cluster {}:".format(number), members)
    return rows, mean_potential, mean_overall, members


cluster1, pos1, ov1, mem1 = _describe_cluster(0, 1)
cluster1.head()
cluster2, pos2, ov2, mem2 = _describe_cluster(1, 2)
cluster2.head()
cluster3, pos3, ov3, mem3 = _describe_cluster(2, 3)
cluster3.head()
cluster4, pos4, ov4, mem4 = _describe_cluster(3, 4)
cluster4.head()
# As per the results, we assign the following tags to the clusters:
# Rank the four clusters by their mean 'Overall' and tag them from "Best"
# to "Below Average". k-means numbers its clusters arbitrarily, so the
# ranking is derived from the statistics instead of being hard-coded.
# Each entry is (cluster number, mean potential, mean overall, members).
cluster_stats = [(1, pos1, ov1, mem1), (2, pos2, ov2, mem2),
                 (3, pos3, ov3, mem3), (4, pos4, ov4, mem4)]
cluster_stats.sort(key=lambda stats: stats[2], reverse=True)

clus = {"Index": [1, 2, 3, 4],
        "Tags": ["Best", "Good", "Average", "Below Average"],
        "Potential Mean": [round(stats[1], 3) for stats in cluster_stats],
        "Overall Mean": [round(stats[2], 3) for stats in cluster_stats],
        "Players": [stats[3] for stats in cluster_stats],
        "Cluster": [stats[0] for stats in cluster_stats]}
clus_sum = pd.DataFrame(clus)
clus_sum.set_index(['Index'], inplace=True)
clus_sum
# Scatter of Overall vs Potential, coloured by cluster label.
# x/y are passed by keyword and the facet size uses `height`: current
# seaborn rejects positional x, y and no longer accepts `size`.
sns.set_style('whitegrid')
sns.lmplot(x='Overall', y='Potential', data=df, hue='Labels', palette="husl",
           height=6, aspect=1, fit_reg=False)